Load raw data, annotate probes using biomaRt, filter probes and load SFARI genes
Filtering criteria:
Probes with no length informatoin in datProbes (5)
Samples from Subject AN03345 (2)
Probes that have no expression on at least half of the samples (30197)
if(!file.exists('./../Data/Gandal_RNASeq.RData')){
# Load csvs
datExpr = read.csv('./../../Gandal/RNAseq/raw_data/RNAseq_ASD_datExpr.csv', row.names=1)
datMeta = read.csv('./../../Gandal/RNAseq/raw_data/RNAseq_ASD_datMeta.csv')
SFARI_genes = read_csv('./../../Gandal/RNAseq/working_data/SFARI_genes_01-15-2019.csv')
# Make sure datExpr and datMeta columns/rows match
rownames(datMeta) = paste0('X', datMeta$Dissected_Sample_ID)
if(!all(colnames(datExpr) == rownames(datMeta))){
print('Columns in datExpr don\'t match the rowd in datMeta!')
}
# Annotate probes
getinfo = c('ensembl_gene_id','external_gene_id','chromosome_name','start_position',
'end_position','strand','band','gene_biotype','percentage_gc_content')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
dataset='hsapiens_gene_ensembl',
host='feb2014.archive.ensembl.org') ## Gencode v19
datProbes = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=rownames(datExpr), mart=mart)
datProbes = datProbes[match(rownames(datExpr), datProbes$ensembl_gene_id),]
datProbes$length = datProbes$end_position-datProbes$start_position
# Group brain regions by lobes
datMeta$Brain_Region = as.factor(datMeta$Region)
datMeta$Brain_lobe = 'Occipital'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA4_6', 'BA9', 'BA24', 'BA44_45')] = 'Frontal'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA3_1_2_5', 'BA7')] = 'Parietal'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA38', 'BA39_40', 'BA20_37', 'BA41_42_22')] = 'Temporal'
datMeta$Brain_lobe=factor(datMeta$Brain_lobe, levels=c('Frontal', 'Temporal', 'Parietal', 'Occipital'))
#################################################################################
# FILTERS:
# 1 Filter probes with start or end position missing (filter 5)
# These can be filtered without probe info, they have weird IDS that don't start with ENS
to_keep = !is.na(datProbes$length)
datProbes = datProbes[to_keep,]
datExpr = datExpr[to_keep,]
rownames(datProbes) = datProbes$ensembl_gene_id
# 2. Filter samples from ID AN03345 (filter 2)
to_keep = (datMeta$Subject_ID != 'AN03345')
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]
# 3. Filter samples with rowSums <= 40 (at least half of the samples without any expression)
to_keep = rowSums(datExpr)>40
datExpr = datExpr[to_keep,]
datProbes = datProbes[to_keep,]
#################################################################################
# DEA using DESeq2
counts = as.matrix(datExpr)
rowRanges = GRanges(datProbes$chromosome_name,
IRanges(datProbes$start_position, width=datProbes$length),
strand=datProbes$strand,
feature_id=datProbes$ensembl_gene_id)
se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta)
ddsSE = DESeqDataSet(se, design =~Diagnosis_)
dds = DESeq(ddsSE)
DE_info = results(dds) %>% data.frame %>% rownames_to_column(var = 'ID') %>%
mutate('logFC'=log2FoldChange, 'adj.P.Val'=padj) %>%
dplyr::select(-log2FoldChange, -padj)
#################################################################################
# Annotate SFARI genes
# Get ensemble IDS for SFARI genes
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
dataset='hsapiens_gene_ensembl',
host='feb2014.archive.ensembl.org') ## Gencode v19
gene_names = getBM(attributes=c('ensembl_gene_id', 'hgnc_symbol'), filters=c('hgnc_symbol'),
values=SFARI_genes$`gene-symbol`, mart=mart) %>%
mutate('gene-symbol'=hgnc_symbol, 'ID'=as.character(ensembl_gene_id)) %>%
dplyr::select('ID', 'gene-symbol')
SFARI_genes = left_join(SFARI_genes, gene_names, by='gene-symbol')
#################################################################################
# Add functional annotation to genes from GO
GO_annotations = read.csv('./../../Gandal/RNAseq/working_data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>%
mutate('ID' = as.character(ensembl_gene_id)) %>%
dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
mutate('Neuronal' = 1)
save(datExpr, datMeta, datProbes, GO_neuronal, SFARI_genes, DE_info, file='./../Data/Gandal_RNASeq.RData')
rm(getinfo, to_keep, gene_names, mart, counts, dds, ddsSE, GO_annotations, rowRanges, se)
} else load('./../Data/Gandal_RNASeq.RData')
datExpr_backup = datExpr
Number of genes and samples:
dim(datExpr)
## [1] 33478 86
Gene count by SFARI score of remaining genes:
table(SFARI_genes$`gene-score`)
##
## 1 2 3 4 5 6
## 29 82 209 538 191 25
Gene count by Diagnosis and Brain lobe:
t(table(datMeta$Brain_lobe, datMeta$Diagnosis_))
##
## Frontal Temporal Parietal Occipital
## ASD 8 14 14 15
## CTL 13 6 8 8
Relation between SFARI scores and Neuronal functional annotation:
There is one gene in the SFARI list that has score both 3 and 4
Higher SFARI scores have a higher percentage of genes with neuronal functional annotation (makes sense)
Neuronal_SFARI = data.frame('ID'=rownames(datExpr), 'Neuronal'=rownames(datExpr) %in% GO_neuronal$ID) %>%
left_join(SFARI_genes, by='ID')
tbl = table(Neuronal_SFARI$`gene-score`, Neuronal_SFARI$Neuronal, useNA='ifany') %>% t %>% as.data.frame.matrix
rownames(tbl) = c('Non-Neuronal','Neuronal')
tbl
## 1 2 3 4 5 6 NA
## Non-Neuronal 15 44 146 359 129 20 31601
## Neuronal 9 16 42 74 38 4 982
tbl = round(sweep(tbl, 2, colSums(tbl), `/`)*100, 2)
tbl
## 1 2 3 4 5 6 NA
## Non-Neuronal 62.5 73.33 77.66 82.91 77.25 83.33 96.99
## Neuronal 37.5 26.67 22.34 17.09 22.75 16.67 3.01
rm(Neuronal_SFARI, tbl)
make_ASD_vs_CTL_df = function(datExpr){
datExpr_ASD = datExpr %>% data.frame %>% dplyr::select(which(datMeta$Diagnosis_=='ASD'))
datExpr_CTL = datExpr %>% data.frame %>% dplyr::select(which(datMeta$Diagnosis_!='ASD'))
ASD_vs_CTL = data.frame('ID'=as.character(rownames(datExpr)),
'mean' = rowMeans(datExpr), 'sd' = apply(datExpr, 1, sd),
'mean_ASD'=rowMeans(datExpr_ASD), 'mean_CTL'=rowMeans(datExpr_CTL),
'sd_ASD'=apply(datExpr_ASD,1,sd), 'sd_CTL'=apply(datExpr_CTL,1,sd)) %>%
mutate('mean_diff'=mean_ASD-mean_CTL, 'sd_diff'=sd_ASD-sd_CTL) %>%
left_join(SFARI_genes, by='ID') %>%
dplyr::select(ID, mean, sd, mean_ASD, mean_CTL, mean_diff, sd_ASD, sd_CTL, sd_diff, `gene-score`) %>%
mutate('Neuronal'=ifelse(ID %in% GO_neuronal$ID, 'Neuronal', 'Non-Neuronal'))
ASD_vs_CTL_SFARI = ASD_vs_CTL %>% filter(!is.na(`gene-score`)) %>%
mutate(Group = as.character(`gene-score`)) %>% dplyr::select(-c(Neuronal,`gene-score`))
ASD_vs_CTL_Neuro = ASD_vs_CTL %>% mutate(Group = Neuronal) %>% dplyr::select(-c(Neuronal,`gene-score`))
ASD_vs_CTL_all = ASD_vs_CTL %>% mutate(Group = 'All') %>% dplyr::select(-c(Neuronal,`gene-score`))
ASD_vs_CTL_together = bind_rows(ASD_vs_CTL_SFARI, ASD_vs_CTL_Neuro, ASD_vs_CTL_all) %>%
mutate(Group = ordered(Group, levels=c('1','2','3','4','5','6',
'Neuronal','Non-Neuronal','All'))) %>%
left_join(DE_info, by='ID')
return(ASD_vs_CTL_together)
}
ASD_vs_CTL = make_ASD_vs_CTL_df(datExpr)
p1 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, mean, fill=Group)) +
geom_boxplot() + theme_minimal() +
ggtitle('Mean (left) and Standard Deviation (right) by score') +
scale_fill_manual(values=gg_colour_hue(9)) +
theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))
p2 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, sd, fill=Group)) +
geom_boxplot() + theme_minimal() +
ggtitle('Mean (left) and Standard Deviation (right) by score') +
scale_fill_manual(values=gg_colour_hue(9)) +
theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))
subplot(p1, p2, nrows=1)
rm(p1, p2)
There is heterocedasticity in the data, but the variance increases at a lower rate than the mean, so the \(log_2\) transformation may affect the variance
ggplot(ASD_vs_CTL, aes(mean+1, sd+1)) + geom_point(alpha=0.1, aes(id=ID, fill=Group, color=Group)) +
scale_color_manual(values=gg_colour_hue(9)) + scale_fill_manual(values=gg_colour_hue(9)) +
scale_x_continuous(trans='log2') + scale_y_continuous(trans='log2') +
ggtitle(paste0('Corr=',round(cor(ASD_vs_CTL$mean, ASD_vs_CTL$sd), 2))) +
geom_smooth(method=lm, se=FALSE, color='#808080', size=0.5) + theme_minimal()
The higher the SFARI score, the bigger the difference between the level of expression between Autism and Control
The higher the SFARI score, the smaller the log Fold Change between Autism and Control
p1 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, abs(mean_diff), fill=Group)) +
geom_boxplot() + theme_minimal() +
ggtitle('Difference in mean (left) and log Fold Change (right) by Diagnosis') +
scale_fill_manual(values=gg_colour_hue(9)) +
theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))
p2 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, abs(logFC), fill=Group)) +
geom_boxplot() + theme_minimal() +
ggtitle('Difference in mean (left) and log Fold Change (right) by Diagnosis') +
scale_fill_manual(values=gg_colour_hue(9)) +
theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))
subplot(p1, p2, nrows=1)
rm(p1, p2)
Transformation: \(log_2(datExpr + 1)\)
Neuronal annotated genes behave most similar to genes with SFARI score 4, 5 and 6
ASD_vs_CTL = make_ASD_vs_CTL_df(log2(datExpr+1))
p1 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, mean, fill=Group)) +
geom_boxplot() + theme_minimal() +
scale_fill_manual(values=gg_colour_hue(9)) +
theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))
p2 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, sd, fill=Group)) +
geom_boxplot() + theme_minimal() +
scale_fill_manual(values=gg_colour_hue(9)) +
theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))
subplot(p1, p2, nrows=1)
rm(p1, p2)
There is still heterocedasticity in the data, since the variance increased at a lower rate than the mean, by applying \(log_2\) transformation to the data, the relation between mean and variance inverted
ggplot(ASD_vs_CTL, aes(mean, sd)) + geom_point(alpha=0.1, aes(id=ID, fill=Group, color=Group)) +
scale_color_manual(values=gg_colour_hue(9)) + scale_fill_manual(values=gg_colour_hue(9)) +
ggtitle(paste0('Corr=',round(cor(ASD_vs_CTL$mean, ASD_vs_CTL$sd), 2))) +
geom_smooth(method=lm, se=FALSE, color='#808080', size=0.5) + theme_minimal()
The higher the SFARI score, the smaller the difference between the level of expression between Autism and Control
The second plot is the same as with the raw data, but this time it agrees with the left plot
p1 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, abs(mean_diff), fill=Group)) +
geom_boxplot() + theme_minimal() +
ggtitle('Difference in mean (left) and log Fold Change (right) by Diagnosis') +
scale_fill_manual(values=gg_colour_hue(9)) +
theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))
p2 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, abs(logFC), fill=Group)) +
geom_boxplot() + theme_minimal() +
ggtitle('Difference in mean (left) and log Fold Change (right) by Diagnosis') +
scale_fill_manual(values=gg_colour_hue(9)) +
theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))
subplot(p1, p2, nrows=1)
rm(p1, p2)
counts = as.matrix(datExpr)
rowRanges = GRanges(datProbes$chromosome_name,
IRanges(datProbes$start_position, width=datProbes$length),
strand=datProbes$strand,
feature_id=datProbes$ensembl_gene_id)
se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta)
dds = DESeqDataSet(se, design =~Diagnosis_)
dds = estimateSizeFactors(dds)
vst_output = vst(dds)
datExpr_vst = assay(vst_output)
ASD_vs_CTL = make_ASD_vs_CTL_df(datExpr_vst)
p1 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, mean, fill=Group)) +
geom_boxplot() + theme_minimal() +
scale_fill_manual(values=gg_colour_hue(9)) +
theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))
p2 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, sd, fill=Group)) +
geom_boxplot() + theme_minimal() +
scale_fill_manual(values=gg_colour_hue(9)) +
theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))
subplot(p1, p2, nrows=1)
rm(p1, p2)
ggplot(ASD_vs_CTL, aes(mean, sd)) + geom_point(alpha=0.1, aes(id=ID, fill=Group, color=Group)) +
scale_color_manual(values=gg_colour_hue(9)) + scale_fill_manual(values=gg_colour_hue(9)) +
ggtitle(paste0('Corr=',round(cor(ASD_vs_CTL$mean, ASD_vs_CTL$sd), 2))) +
geom_smooth(method=lm, se=FALSE, color='#808080', size=0.5) + theme_minimal()
The higher the SFARI score, the smaller the difference between the level of expression between Autism and Control
The second plot is the same as with the raw data, but this time it agrees with the left plot
p1 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, abs(mean_diff), fill=Group)) +
geom_boxplot() + theme_minimal() +
ggtitle('Difference in mean (left) and log Fold Change (right) by Diagnosis') +
scale_fill_manual(values=gg_colour_hue(9)) +
theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))
p2 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, abs(logFC), fill=Group)) +
geom_boxplot() + theme_minimal() +
ggtitle('Difference in mean (left) and log Fold Change (right) by Diagnosis') +
scale_fill_manual(values=gg_colour_hue(9)) +
theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))
subplot(p1, p2, nrows=1)
rm(p1, p2)
The higher the SFARI score:
The higher the percentage of the genes with Neuronal functional annotation
The higher the level of expression
The higher the difference between Autism and Control gene expression levels, but only in raw data (probably because of heteroscedasticity)
The lower the log Fold Change